In [3]:
import pandas as pd
In [11]:
df = pd.read_csv(r'C:\Machine Learning\Datasets\weatherAUS.csv')
print(df)
              Date Location  MinTemp  MaxTemp  Rainfall  Evaporation  \
0       2008-12-01   Albury     13.4     22.9       0.6          NaN   
1       2008-12-02   Albury      7.4     25.1       0.0          NaN   
2       2008-12-03   Albury     12.9     25.7       0.0          NaN   
3       2008-12-04   Albury      9.2     28.0       0.0          NaN   
4       2008-12-05   Albury     17.5     32.3       1.0          NaN   
...            ...      ...      ...      ...       ...          ...   
145455  2017-06-21    Uluru      2.8     23.4       0.0          NaN   
145456  2017-06-22    Uluru      3.6     25.3       0.0          NaN   
145457  2017-06-23    Uluru      5.4     26.9       0.0          NaN   
145458  2017-06-24    Uluru      7.8     27.0       0.0          NaN   
145459  2017-06-25    Uluru     14.9      NaN       0.0          NaN   

        Sunshine WindGustDir  WindGustSpeed WindDir9am  ... Humidity9am  \
0            NaN           W           44.0          W  ...        71.0   
1            NaN         WNW           44.0        NNW  ...        44.0   
2            NaN         WSW           46.0          W  ...        38.0   
3            NaN          NE           24.0         SE  ...        45.0   
4            NaN           W           41.0        ENE  ...        82.0   
...          ...         ...            ...        ...  ...         ...   
145455       NaN           E           31.0         SE  ...        51.0   
145456       NaN         NNW           22.0         SE  ...        56.0   
145457       NaN           N           37.0         SE  ...        53.0   
145458       NaN          SE           28.0        SSE  ...        51.0   
145459       NaN         NaN            NaN        ESE  ...        62.0   

        Humidity3pm  Pressure9am  Pressure3pm  Cloud9am  Cloud3pm  Temp9am  \
0              22.0       1007.7       1007.1       8.0       NaN     16.9   
1              25.0       1010.6       1007.8       NaN       NaN     17.2   
2              30.0       1007.6       1008.7       NaN       2.0     21.0   
3              16.0       1017.6       1012.8       NaN       NaN     18.1   
4              33.0       1010.8       1006.0       7.0       8.0     17.8   
...             ...          ...          ...       ...       ...      ...   
145455         24.0       1024.6       1020.3       NaN       NaN     10.1   
145456         21.0       1023.5       1019.1       NaN       NaN     10.9   
145457         24.0       1021.0       1016.8       NaN       NaN     12.5   
145458         24.0       1019.4       1016.5       3.0       2.0     15.1   
145459         36.0       1020.2       1017.9       8.0       8.0     15.0   

        Temp3pm  RainToday  RainTomorrow  
0          21.8         No            No  
1          24.3         No            No  
2          23.2         No            No  
3          26.5         No            No  
4          29.7         No            No  
...         ...        ...           ...  
145455     22.4         No            No  
145456     24.5         No            No  
145457     26.1         No            No  
145458     26.0         No            No  
145459     20.9         No           NaN  

[145460 rows x 23 columns]
In [12]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null   float64
 18  Cloud3pm       86102 non-null   float64
 19  Temp9am        143693 non-null  float64
 20  Temp3pm        141851 non-null  float64
 21  RainToday      142199 non-null  object 
 22  RainTomorrow   142193 non-null  object 
dtypes: float64(16), object(7)
memory usage: 25.5+ MB
In [13]:
df.dropna(subset = ['RainToday','RainTomorrow'], inplace=True)
In [14]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 140787 entries, 0 to 145458
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           140787 non-null  object 
 1   Location       140787 non-null  object 
 2   MinTemp        140319 non-null  float64
 3   MaxTemp        140480 non-null  float64
 4   Rainfall       140787 non-null  float64
 5   Evaporation    81093 non-null   float64
 6   Sunshine       73982 non-null   float64
 7   WindGustDir    131624 non-null  object 
 8   WindGustSpeed  131682 non-null  float64
 9   WindDir9am     131127 non-null  object 
 10  WindDir3pm     137117 non-null  object 
 11  WindSpeed9am   139732 non-null  float64
 12  WindSpeed3pm   138256 non-null  float64
 13  Humidity9am    139270 non-null  float64
 14  Humidity3pm    137286 non-null  float64
 15  Pressure9am    127044 non-null  float64
 16  Pressure3pm    127018 non-null  float64
 17  Cloud9am       88162 non-null   float64
 18  Cloud3pm       84693 non-null   float64
 19  Temp9am        140131 non-null  float64
 20  Temp3pm        138163 non-null  float64
 21  RainToday      140787 non-null  object 
 22  RainTomorrow   140787 non-null  object 
dtypes: float64(16), object(7)
memory usage: 25.8+ MB
In [18]:
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
In [19]:
sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] =14
matplotlib.rcParams['figure.figsize'] =(10,6)
matplotlib.rcParams['figure.facecolor'] = '#00000000'         
In [21]:
df.Location.nunique()
Out[21]:
49
In [23]:
import plotly.express as px
In [24]:
px.histogram(df,x='Location',title='Location vs. Rainy Days', color = 'RainToday')
In [28]:
px.histogram(df,x='Temp3pm',title='Tempat3PM vs. Raintomorrow',color='RainTomorrow')
In [32]:
px.histogram(df,x='RainTomorrow',title='RainToday vs. RainTomorrow',color='RainToday')
In [34]:
px.scatter(df.sample(2000),title = 'Min Temp vs Max Temp' , x = 'MinTemp',y='MaxTemp',color='RainToday')
In [35]:
from sklearn.model_selection import train_test_split
In [36]:
train_val_df, test_df = train_test_split(df,test_size=0.2, random_state=42)
train_df,val_df = train_test_split(train_val_df,test_size=0.25,random_state=42)
In [38]:
print('train_df.shape:' , train_df.shape)
print('val_df.shape:' ,  train_df.shape)
print('test_df.shape:' , train_df.shape)
train_df.shape: (84471, 23)
val_df.shape: (84471, 23)
test_df.shape: (84471, 23)
In [40]:
plt.title('No of rows per year')
sns.countplot(x=pd.to_datetime(df.Date).dt.year);
In [45]:
year = pd.to_datetime(df.Date).dt.year
In [46]:
train_df = df[year<2015]
val_df = df[year==2015]
test_df = df[year>2015]
In [47]:
input_col = list(train_df.columns)[1:-1]
target_col = 'RainTomorrow'
In [48]:
print(input_col)
['Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 'RainToday']
In [49]:
target_col
Out[49]:
'RainTomorrow'
In [51]:
train_inputs = train_df[input_col].copy()
train_targets = train_df[target_col].copy()
In [52]:
val_inputs = val_df[input_col].copy()
val_targets = val_df[target_col].copy()
In [53]:
test_inputs = test_df[input_col].copy()
test_targets = test_df[target_col].copy()
In [55]:
train_inputs
Out[55]:
Location MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustDir WindGustSpeed WindDir9am WindDir3pm ... WindSpeed3pm Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm RainToday
0 Albury 13.4 22.9 0.6 NaN NaN W 44.0 W WNW ... 24.0 71.0 22.0 1007.7 1007.1 8.0 NaN 16.9 21.8 No
1 Albury 7.4 25.1 0.0 NaN NaN WNW 44.0 NNW WSW ... 22.0 44.0 25.0 1010.6 1007.8 NaN NaN 17.2 24.3 No
2 Albury 12.9 25.7 0.0 NaN NaN WSW 46.0 W WSW ... 26.0 38.0 30.0 1007.6 1008.7 NaN 2.0 21.0 23.2 No
3 Albury 9.2 28.0 0.0 NaN NaN NE 24.0 SE E ... 9.0 45.0 16.0 1017.6 1012.8 NaN NaN 18.1 26.5 No
4 Albury 17.5 32.3 1.0 NaN NaN W 41.0 ENE NW ... 20.0 82.0 33.0 1010.8 1006.0 7.0 8.0 17.8 29.7 No
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
144548 Uluru 16.9 33.2 0.0 NaN NaN SSE 43.0 ESE SSE ... 26.0 22.0 13.0 1014.1 1009.8 NaN NaN 23.7 31.8 No
144549 Uluru 15.1 36.8 0.0 NaN NaN NE 31.0 ENE SW ... 20.0 16.0 8.0 1012.6 1007.6 NaN NaN 28.9 34.8 No
144550 Uluru 17.3 37.8 0.0 NaN NaN ESE 39.0 ESE SSE ... 9.0 15.0 8.0 1011.9 1008.0 NaN NaN 29.7 35.7 No
144551 Uluru 20.1 38.5 0.0 NaN NaN ESE 43.0 ESE SSW ... 17.0 22.0 9.0 1014.0 1009.2 NaN NaN 29.8 37.2 No
144552 Uluru 22.5 39.6 0.0 NaN NaN WNW 76.0 ENE SSW ... 13.0 16.0 9.0 1012.1 1006.2 NaN NaN 30.1 37.4 No

97988 rows × 21 columns

In [57]:
import numpy as np
numeric_col = train_inputs.select_dtypes(include=np.number).columns.tolist()
categorical_col = train_inputs.select_dtypes('object').columns.tolist()
In [59]:
numeric_col
Out[59]:
['MinTemp',
 'MaxTemp',
 'Rainfall',
 'Evaporation',
 'Sunshine',
 'WindGustSpeed',
 'WindSpeed9am',
 'WindSpeed3pm',
 'Humidity9am',
 'Humidity3pm',
 'Pressure9am',
 'Pressure3pm',
 'Cloud9am',
 'Cloud3pm',
 'Temp9am',
 'Temp3pm']
In [61]:
categorical_col
Out[61]:
['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']
In [63]:
train_inputs[numeric_col].describe()
Out[63]:
MinTemp MaxTemp Rainfall Evaporation Sunshine WindGustSpeed WindSpeed9am WindSpeed3pm Humidity9am Humidity3pm Pressure9am Pressure3pm Cloud9am Cloud3pm Temp9am Temp3pm
count 97674.000000 97801.000000 97988.000000 61657.000000 57942.000000 91160.000000 97114.000000 96919.000000 96936.000000 96872.000000 88876.000000 88857.000000 63000.000000 61966.000000 97414.000000 97392.000000
mean 12.007831 23.022202 2.372935 5.289991 7.609004 40.215873 14.092263 18.764608 68.628745 51.469547 1017.513734 1015.132352 4.302952 4.410677 16.835126 21.540138
std 6.347175 6.984397 8.518819 3.952010 3.788813 13.697967 8.984203 8.872398 19.003097 20.756113 7.072510 6.997072 2.866634 2.693370 6.404586 6.831612
min -8.500000 -4.100000 0.000000 0.000000 0.000000 6.000000 0.000000 0.000000 0.000000 0.000000 980.500000 979.000000 0.000000 0.000000 -5.900000 -5.100000
25% 7.500000 17.900000 0.000000 2.600000 4.800000 31.000000 7.000000 13.000000 57.000000 37.000000 1012.800000 1010.400000 1.000000 2.000000 12.200000 16.600000
50% 11.800000 22.400000 0.000000 4.600000 8.500000 39.000000 13.000000 19.000000 70.000000 52.000000 1017.500000 1015.100000 5.000000 5.000000 16.600000 20.900000
75% 16.600000 27.900000 0.800000 7.200000 10.600000 48.000000 19.000000 24.000000 83.000000 66.000000 1022.300000 1019.900000 7.000000 7.000000 21.400000 26.200000
max 33.900000 48.100000 371.000000 82.400000 14.300000 135.000000 87.000000 87.000000 100.000000 100.000000 1041.000000 1039.600000 9.000000 9.000000 40.200000 46.100000
In [64]:
train_inputs[categorical_col].describe()
Out[64]:
Location WindGustDir WindDir9am WindDir3pm RainToday
count 97988 91120 90969 96036 97988
unique 49 16 16 16 2
top Canberra W N SE No
freq 2506 6672 8012 7603 76002
In [65]:
train_inputs[categorical_col].nunique()
Out[65]:
Location       49
WindGustDir    16
WindDir9am     16
WindDir3pm     16
RainToday       2
dtype: int64
In [66]:
from sklearn.impute import SimpleImputer
In [68]:
imputer = SimpleImputer(strategy = 'mean')
In [69]:
df[numeric_col].isna().sum()
Out[69]:
MinTemp            468
MaxTemp            307
Rainfall             0
Evaporation      59694
Sunshine         66805
WindGustSpeed     9105
WindSpeed9am      1055
WindSpeed3pm      2531
Humidity9am       1517
Humidity3pm       3501
Pressure9am      13743
Pressure3pm      13769
Cloud9am         52625
Cloud3pm         56094
Temp9am            656
Temp3pm           2624
dtype: int64
In [71]:
 imputer.fit(df[numeric_col])
Out[71]:
SimpleImputer()
In [72]:
list(imputer.statistics_)
Out[72]:
[12.18482386562048,
 23.235120301822324,
 2.349974074310839,
 5.472515506887154,
 7.630539861047281,
 39.97051988882308,
 13.990496092519967,
 18.631140782316862,
 68.82683277087672,
 51.44928834695453,
 1017.6545771543717,
 1015.2579625879797,
 4.431160817585808,
 4.499250233195188,
 16.98706638787991,
 21.69318269001107]
In [73]:
train_inputs[numeric_col] = imputer.transform(train_inputs[numeric_col])
val_inputs[numeric_col] = imputer.transform(val_inputs[numeric_col])
test_inputs[numeric_col] = imputer.transform(test_inputs[numeric_col])
In [75]:
train_inputs[numeric_col].isna().sum()
Out[75]:
MinTemp          0
MaxTemp          0
Rainfall         0
Evaporation      0
Sunshine         0
WindGustSpeed    0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
dtype: int64
In [78]:
from sklearn.preprocessing import oneHotEncoder
---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
Input In [78], in <cell line: 1>()
----> 1 from sklearn.preprocessing import oneHotEncoder

ImportError: cannot import name 'oneHotEncoder' from 'sklearn.preprocessing' (C:\Users\Vikas Yadav\anaconda3\lib\site-packages\sklearn\preprocessing\__init__.py)
In [ ]: